{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-10-04 23:00:26.623149: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2023-10-04 23:00:26.692624: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2023-10-04 23:00:27.082651: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
      "2023-10-04 23:00:27.082697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
      "2023-10-04 23:00:27.082700: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n",
      "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer, T5ForConditionalGeneration\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained('mesolitica/nanot5-base-malaysian-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['nanot5-small-malaysian-cased/checkpoint-1160000',\n",
       " 'nanot5-small-malaysian-cased/checkpoint-1170000',\n",
       " 'nanot5-small-malaysian-cased/checkpoint-1180000']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "checkpoints = sorted(glob('nanot5-small-malaysian-cased/checkpoint-*'))\n",
    "checkpoints"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = T5ForConditionalGeneration.from_pretrained(checkpoints[-1])\n",
    "model.config.eos_token_id = tokenizer.eos_token_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "_ = model.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['ساي تق سوك ايم']\n"
     ]
    }
   ],
   "source": [
    "s = \"\"\"\n",
    "saya tak suka ayam\n",
    "\"\"\".strip()\n",
    "input_ids = tokenizer.encode(f'terjemah ke Jawi: {s}{tokenizer.eos_token}', return_tensors = 'pt').cuda()\n",
    "outputs = model.generate(input_ids, max_length = 100)\n",
    "print(tokenizer.batch_decode(outputs, skip_special_tokens = True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "9999it [17:34,  9.48it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "import json\n",
    "\n",
    "jawi_left, jawi_right = [], []\n",
    "rumi_left, rumi_right = [], []\n",
    "\n",
    "with open('jawi-rumi-news-full.test') as fopen:\n",
    "    for l in tqdm(fopen):\n",
    "        l = json.loads(l)\n",
    "        input_ids = tokenizer.encode(f'terjemah ke Jawi: {l[0]}{tokenizer.eos_token}', return_tensors = 'pt').cuda()\n",
    "        outputs = model.generate(input_ids, max_length = 2048)\n",
    "        jawi_left.append(tokenizer.batch_decode(outputs, skip_special_tokens = True)[0])\n",
    "        jawi_right.append(l[1])\n",
    "        \n",
    "        input_ids = tokenizer.encode(f'terjemah ke Rumi: {l[1]}{tokenizer.eos_token}', return_tensors = 'pt').cuda()\n",
    "        outputs = model.generate(input_ids, max_length = 2048)\n",
    "        rumi_left.append(tokenizer.batch_decode(outputs, skip_special_tokens = True)[0])\n",
    "        rumi_right.append(l[0])\n",
    "        \n",
    "        if len(rumi_left) >= 10000:\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sacrebleu.metrics import BLEU, CHRF, TER\n",
    "\n",
    "chrf = CHRF(word_order = 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "chrF2++ = 98.97"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chrf.corpus_score(jawi_left, [jawi_right])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "chrF2++ = 98.01"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chrf.corpus_score(rumi_left, [rumi_right])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
